There are five primary dplyr verbs, representing distinct data analysis tasks:
data(french_fries, package = "reshape2")
french_fries %>%
filter(subject == 3, time == 1)
#> time treatment subject rep potato buttery grassy rancid painty
#> 1 1 1 3 1 2.9 0.0 0.0 0.0 5.5
#> 2 1 1 3 2 14.0 0.0 0.0 1.1 0.0
#> 3 1 2 3 1 13.9 0.0 0.0 3.9 0.0
#> 4 1 2 3 2 13.4 0.1 0.0 1.5 0.0
#> 5 1 3 3 1 14.1 0.0 0.0 1.1 0.0
#> 6 1 3 3 2 9.5 0.0 0.6 2.8 0.0
french_fries %>%
arrange(desc(rancid)) %>%
head
#> time treatment subject rep potato buttery grassy rancid painty
#> 1 9 2 51 1 7.3 2.3 0 14.9 0.1
#> 2 10 1 86 2 0.7 0.0 0 14.3 13.1
#> 3 5 2 63 1 4.4 0.0 0 13.8 0.6
#> 4 9 2 63 1 1.8 0.0 0 13.7 12.3
#> 5 5 2 19 2 5.5 4.7 0 13.4 4.6
#> 6 4 3 63 1 5.6 0.0 0 13.3 4.4
french_fries %>%
select(time, treatment, subject, rep, potato) %>%
head
#> time treatment subject rep potato
#> 61 1 1 3 1 2.9
#> 25 1 1 3 2 14.0
#> 62 1 1 10 1 11.0
#> 26 1 1 10 2 9.9
#> 63 1 1 15 1 1.2
#> 27 1 1 15 2 8.8
french_fries %>%
summarise(mean_rancid = mean(rancid, na.rm=TRUE),
sd_rancid = sd(rancid, na.rm = TRUE))
#> mean_rancid sd_rancid
#> 1 3.85223 3.781815
french_fries %>%
group_by(time, treatment) %>%
summarise(mean_rancid = mean(rancid), sd_rancid = sd(rancid))
#> Source: local data frame [30 x 4]
#> Groups: time [?]
#>
#> time treatment mean_rancid sd_rancid
#> <fctr> <fctr> <dbl> <dbl>
#> 1 1 1 2.758333 3.212870
#> 2 1 2 1.716667 2.714801
#> 3 1 3 2.600000 3.202037
#> 4 2 1 3.900000 4.374730
#> 5 2 2 2.141667 3.117540
#> 6 2 3 2.495833 3.378767
#> 7 3 1 4.650000 3.933358
#> 8 3 2 2.895833 3.773532
#> 9 3 3 3.600000 3.592867
#> 10 4 1 2.079167 2.394737
#> # ... with 20 more rows
to answer these french fry experiment questions:
If the data is complete it should be 12 x 10 x 3 x 2, that is, 6 records for each person. (Assuming that each person rated on all scales.)
To check this we want to tabulate the number of records for each subject, time and treatment. This means select appropriate columns, tabulate, count and spread it out to give a nice table.
french_fries %>%
select(subject, time, treatment) %>%
tbl_df() %>%
count(subject, time) %>%
spread(time, n)
#> Source: local data frame [12 x 11]
#> Groups: subject [12]
#>
#> subject 1 2 3 4 5 6 7 8 9 10
#> * <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 3 6 6 6 6 6 6 6 6 6 NA
#> 2 10 6 6 6 6 6 6 6 6 6 6
#> 3 15 6 6 6 6 6 6 6 6 6 6
#> 4 16 6 6 6 6 6 6 6 6 6 6
#> 5 19 6 6 6 6 6 6 6 6 6 6
#> 6 31 6 6 6 6 6 6 6 6 NA 6
#> 7 51 6 6 6 6 6 6 6 6 6 6
#> 8 52 6 6 6 6 6 6 6 6 6 6
#> 9 63 6 6 6 6 6 6 6 6 6 6
#> 10 78 6 6 6 6 6 6 6 6 6 6
#> 11 79 6 6 6 6 6 6 6 6 6 NA
#> 12 86 6 6 6 6 6 6 6 6 NA 6
french_fries %>%
gather(type, rating, -subject, -time, -treatment, -rep) %>%
select(subject, time, treatment, type) %>%
tbl_df() %>%
count(subject, time) %>%
spread(time, n)
#> Source: local data frame [12 x 11]
#> Groups: subject [12]
#>
#> subject 1 2 3 4 5 6 7 8 9 10
#> * <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 3 30 30 30 30 30 30 30 30 30 NA
#> 2 10 30 30 30 30 30 30 30 30 30 30
#> 3 15 30 30 30 30 30 30 30 30 30 30
#> 4 16 30 30 30 30 30 30 30 30 30 30
#> 5 19 30 30 30 30 30 30 30 30 30 30
#> 6 31 30 30 30 30 30 30 30 30 NA 30
#> 7 51 30 30 30 30 30 30 30 30 30 30
#> 8 52 30 30 30 30 30 30 30 30 30 30
#> 9 63 30 30 30 30 30 30 30 30 30 30
#> 10 78 30 30 30 30 30 30 30 30 30 30
#> 11 79 30 30 30 30 30 30 30 30 30 NA
#> 12 86 30 30 30 30 30 30 30 30 NA 30
ff.m.av <- ff.m %>%
group_by(subject, time, type, treatment) %>%
summarise(rating=mean(rating))
ggplot(data=ff.m, aes(x=time, y=rating, colour=treatment)) +
facet_grid(subject~type) +
geom_line(data=ff.m.av, aes(group=treatment))
Write an answer to each of the questions:
Are replicates like each other? No.
ffp <- french_fries %>%
select(subject, time, treatment, rep, potato) %>%
spread(rep, potato) %>% tbl_df
ggplot(ffp, aes(x = `1`, y = `2`)) + geom_point()
ff_all <- ff.m %>% spread(rep, rating)
ggplot(ff_all, aes(x = `1`, y = `2`)) +
geom_point(aes(color=type))
Do ratings change over the weeks? Only for some subjects.
Why would we even do that???
Hans Rosling can explain that really well in his TED talk
gapminder also makes data available (in R through the package gapminder)
library(gapminder)
ggplot(data=gapminder, aes(x=year, y=lifeExp, group=country)) +
geom_line()
How would you describe this plot?
ggplot(data=gapminder, aes(x=year, y=lifeExp, group=country,
color=continent, shape=continent)) +
geom_line()
How about now?
purrr package has been unveiled in February (presented by Hadley Wickham at WOMBAT)tidyr and purrr, i.e. run the following code:library(purrr)
usa <- gapminder %>% filter(country=="United States")
head(usa)
#> # A tibble: 6 x 6
#> country continent year lifeExp pop gdpPercap
#> <fctr> <fctr> <dbl> <dbl> <int> <dbl>
#> 1 United States Americas 2 68.44 157553000 13990.48
#> 2 United States Americas 7 69.49 171984000 14847.13
#> 3 United States Americas 12 70.21 186538000 16173.15
#> 4 United States Americas 17 70.76 198712000 19530.37
#> 5 United States Americas 22 71.34 209896000 21806.04
#> 6 United States Americas 27 73.38 220239000 24072.63
ggplot(data=usa, aes(x=year, y=lifeExp)) +
geom_point() +
geom_smooth(method="lm")
usa.lm <- lm(lifeExp~year, data=usa)
usa.lm
#>
#> Call:
#> lm(formula = lifeExp ~ year, data = usa)
#>
#> Coefficients:
#> (Intercept) year
#> 68.0455 0.1842
Intercept is estimated life expectancy at 0 BC - let’s use 1950 for the first value:
gapminder <- gapminder %>% mutate(year = year-1950)
We don’t want to subset the data for every country …
nest() makes a data frame part of another data frame:
by_country <- gapminder %>% group_by(continent, country) %>% nest()
by_country %>% glimpse
#> Observations: 142
#> Variables: 3
#> $ continent <fctr> Asia, Europe, Africa, Africa, Americas, Oceania, Eu...
#> $ country <fctr> Afghanistan, Albania, Algeria, Angola, Argentina, A...
#> $ data <list> [<c("-1948", "-1943", "-1938", "-1933", "-1928", "-...
Each element of the data variable in by_country is a dataset:
head(by_country$data[[1]])
#> # A tibble: 6 x 4
#> year lifeExp pop gdpPercap
#> <dbl> <dbl> <int> <dbl>
#> 1 -1948 28.801 8425333 779.4453
#> 2 -1943 30.332 9240934 820.8530
#> 3 -1938 31.997 10267083 853.1007
#> 4 -1933 34.020 11537966 836.1971
#> 5 -1928 36.088 13079460 739.9811
#> 6 -1923 38.438 14880372 786.1134
lm(lifeExp~year, data=by_country$data[[1]])
#>
#> Call:
#> lm(formula = lifeExp ~ year, data = by_country$data[[1]])
#>
#> Coefficients:
#> (Intercept) year
#> 566.2475 0.2753
Now we are using the map function in the package purrr.
map allows us to apply a function to each element of a list.
by_country$model <- by_country$data %>%
purrr::map(~lm(lifeExp~year, data=.))
head(by_country)
#> # A tibble: 6 x 4
#> continent country data model
#> <fctr> <fctr> <list> <list>
#> 1 Asia Afghanistan <tibble [12 x 4]> <S3: lm>
#> 2 Europe Albania <tibble [12 x 4]> <S3: lm>
#> 3 Africa Algeria <tibble [12 x 4]> <S3: lm>
#> 4 Africa Angola <tibble [12 x 4]> <S3: lm>
#> 5 Americas Argentina <tibble [12 x 4]> <S3: lm>
#> 6 Oceania Australia <tibble [12 x 4]> <S3: lm>
summary(by_country$model[[1]])
#>
#> Call:
#> lm(formula = lifeExp ~ year, data = .)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.5447 -0.9905 -0.2757 0.8847 1.6868
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 566.24755 39.27760 14.42 5.12e-08 ***
#> year 0.27533 0.02045 13.46 9.84e-08 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1.223 on 10 degrees of freedom
#> Multiple R-squared: 0.9477, Adjusted R-squared: 0.9425
#> F-statistic: 181.2 on 1 and 10 DF, p-value: 9.835e-08
broom packagebroom allows to extract values from models on three levels:
broom::glancebroom::tidybroom::augmentbroom::glance(by_country$model[[1]])
#> r.squared adj.r.squared sigma statistic p.value df logLik
#> 1 0.9477123 0.9424835 1.222788 181.2494 9.835213e-08 2 -18.34693
#> AIC BIC deviance df.residual
#> 1 42.69387 44.14859 14.9521 10
broom::tidy(by_country$model[[1]])
#> term estimate std.error statistic p.value
#> 1 (Intercept) 566.2475466 39.27760415 14.41655 5.115882e-08
#> 2 year 0.2753287 0.02045093 13.46289 9.835213e-08
broom::augment(by_country$model[[1]])
#> lifeExp year .fitted .se.fit .resid .hat .sigma
#> 1 28.801 -1948 29.90729 0.6639995 -1.10629487 0.29487179 1.211813
#> 2 30.332 -1943 31.28394 0.5799442 -0.95193823 0.22494172 1.237512
#> 3 31.997 -1938 32.66058 0.5026799 -0.66358159 0.16899767 1.265886
#> 4 34.020 -1933 34.03722 0.4358337 -0.01722494 0.12703963 1.288917
#> 5 36.088 -1928 35.41387 0.3848726 0.67413170 0.09906760 1.267003
#> 6 38.438 -1923 36.79051 0.3566719 1.64748834 0.08508159 1.154002
#> 7 39.854 -1918 38.16716 0.3566719 1.68684499 0.08508159 1.147076
#> 8 40.822 -1913 39.54380 0.3848726 1.27820163 0.09906760 1.208243
#> 9 41.674 -1908 40.92044 0.4358337 0.75355828 0.12703963 1.260583
#> 10 41.763 -1903 42.29709 0.5026799 -0.53408508 0.16899767 1.274051
#> 11 42.129 -1898 43.67373 0.5799442 -1.54472844 0.22494172 1.148593
#> 12 43.828 -1893 45.05037 0.6639995 -1.22237179 0.29487179 1.194109
#> .cooksd .std.resid
#> 1 2.427205e-01 -1.07742164
#> 2 1.134714e-01 -0.88428127
#> 3 3.603567e-02 -0.59530844
#> 4 1.653992e-05 -0.01507681
#> 5 1.854831e-02 0.58082792
#> 6 9.225358e-02 1.40857509
#> 7 9.671389e-02 1.44222437
#> 8 6.668277e-02 1.10129103
#> 9 3.165567e-02 0.65958143
#> 10 2.334344e-02 -0.47913530
#> 11 2.987950e-01 -1.43494020
#> 12 2.963271e-01 -1.19046907
Extract all countries automatically (hello map again!)
by_country$stats <- by_country$model %>% purrr::map(broom::tidy)
by_country_coefs <- by_country %>% unnest(stats)
coefs <- by_country_coefs %>%
select(country, continent, term, estimate) %>%
spread(term, estimate)
and finally, a visualization:
ggplot(data=coefs, aes(x=`(Intercept)`, y=year, colour=continent)) +
geom_point()
by_country <- by_country %>% unnest(model %>% purrr::map(broom::glance))
by_country$country <- reorder(by_country$country, by_country$r.squared)
ggplot(data=by_country, aes(x=country, y=r.squared, colour=continent)) +
geom_point() +
theme(axis.text.x=element_text(angle=-90, hjust=0)) +
scale_colour_brewer(palette="Dark2")
country_all <- by_country %>% unnest(data)
ggplot(data=filter(country_all, r.squared <= 0.45),
aes(x=year+1950, y=lifeExp)) +
geom_line() +
facet_wrap(~country)
What do the patterns mean?
ggplot(data=subset(country_all, between(r.squared, 0.45, 0.75)),
aes(x=year+1950, y=lifeExp)) +
geom_line() +
facet_wrap(~country)
extract residuals for each of the models and store it in a dataset together with country and continent information
plot residuals across the years and fit a smooth. What does the pattern mean?
by_country.back <- by_country
by_country$augment <- by_country$model %>% purrr::map(broom::augment)
country_all <- by_country %>% unnest(data, augment)
ggplot(country_all, aes(x=lifeExp, y=.resid)) + geom_point() +
geom_smooth()
Overall, a linear fit is not great: at low and high life expectancy, linear fit is too fast
biobroom package (Bioconductor; Bass, Nelson, Robinson, Storey, 2016) has the same functions as broom, i.e. glance, tidy, and augment.
These functions provide information depending on the input type
library(biobroom)
data(hammer)
counts <- Biobase::exprs(hammer)
head(counts)
#> SRX020102 SRX020103 SRX020104 SRX020105 SRX020091-3
#> ENSRNOG00000000001 2 4 18 24 7
#> ENSRNOG00000000007 4 1 3 1 5
#> ENSRNOG00000000008 0 1 4 2 0
#> ENSRNOG00000000009 0 0 0 0 0
#> ENSRNOG00000000010 19 10 19 13 50
#> ENSRNOG00000000012 7 5 1 0 31
#> SRX020088-90 SRX020094-7 SRX020098-101
#> ENSRNOG00000000001 4 93 77
#> ENSRNOG00000000007 4 9 4
#> ENSRNOG00000000008 5 2 6
#> ENSRNOG00000000009 0 0 0
#> ENSRNOG00000000010 57 45 58
#> ENSRNOG00000000012 26 12 9
published as part of the biobroom package, part of the ReCount project
# more information about the study:
Biobase::phenoData(hammer)@data
#> sample.id num.tech.reps protocol strain Time
#> SRX020102 SRX020102 1 control Sprague Dawley 2 months
#> SRX020103 SRX020103 2 control Sprague Dawley 2 months
#> SRX020104 SRX020104 1 L5 SNL Sprague Dawley 2 months
#> SRX020105 SRX020105 2 L5 SNL Sprague Dawley 2months
#> SRX020091-3 SRX020091-3 1 control Sprague Dawley 2 weeks
#> SRX020088-90 SRX020088-90 2 control Sprague Dawley 2 weeks
#> SRX020094-7 SRX020094-7 1 L5 SNL Sprague Dawley 2 weeks
#> SRX020098-101 SRX020098-101 2 L5 SNL Sprague Dawley 2 weeks
identify differentially expressed genes following the protocol by Robinson, McCarthy and Smyth 2009
library(edgeR)
y <- DGEList(counts = counts, group=Biobase::phenoData(hammer)@data$protocol)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y)
glance(et, alpha = 0.05) # glance on DGEExact
#> significant comparison
#> 1 6993 control/L5 SNL
tet <- tidy(et)
tet$significant <- tet$p.value < 0.05
ggplot(data=tet, aes(x=logCPM, y=estimate, colour=significant)) +
geom_point(alpha=.1) + facet_wrap(~significant, labeller="label_both")
augment(y) stops in an error (this is a bug and reported)
bb <- data.frame(read_tsv("./data/biotin-rma2.txt"))
head(data.frame(bb[,-2]))
#> Gene biotin.WT.01.1 biotin.bio101.4 biotin.WT.B1.2 biotin.bio1B1.3
#> 1 11986_at 6.359180 6.004075 6.325338 6.255888
#> 2 11987_at 7.833549 6.666034 7.738628 7.691321
#> 3 11988_at 6.145599 6.128749 6.258157 6.353912
#> 4 11989_at 5.675039 5.078695 5.444967 5.380756
#> 5 11990_at 5.382078 5.241411 5.209638 5.357740
#> 6 11991_g_at 5.389809 5.087797 5.403933 5.592901
#> biotin.WT.02.1 biotin.bio102.4 biotin.WT.B2.2 biotin.bio1B2.3
#> 1 6.554727 6.144557 6.098169 6.201624
#> 2 7.461711 7.300383 7.353033 7.298990
#> 3 6.097703 6.183593 6.184723 6.276867
#> 4 5.258524 5.328962 5.460898 5.203674
#> 5 5.337013 5.276889 5.434531 5.362002
#> 6 5.571362 5.249382 5.596683 5.140585
row.names(bb) <- bb$Gene
ggpairs(bb, columns=c(3,7,4,8))
sub <- bb %>% select(Gene, biotin.WT.01.1, biotin.WT.02.1, biotin.bio101.4, biotin.bio102.4)
ggplot(sub, aes(x=biotin.WT.01.1, xend=biotin.WT.02.1, y=biotin.bio101.4, yend=biotin.bio102.4)) +
geom_segment() +
theme(aspect.ratio = 1) +
xlab("wildtype, control treatment") +
ylab("mutant, treated")
design <- expand.grid(type=c("wild", "mutant"), trt=c("control", "treatment"), rep=1:2)
fit <- lmFit(bb[,-(1:2)], model.matrix(~ type*trt, design))
fit <- eBayes(fit)
head(topTable(fit))
#> typemutant trttreatment typemutant.trttreatment AveExpr
#> 13212_s_at 3.436575 0.1501056 -3.282545 6.383540
#> 13449_at 2.498316 0.1163889 -2.488131 5.287509
#> 14609_at 2.301786 -0.2724221 -1.962619 5.406537
#> 16016_at -3.749019 1.2864088 3.338380 7.858121
#> 18255_at 1.696560 -0.3977035 -1.326075 6.844090
#> 15162_at 2.699042 0.1022273 -2.354713 8.408859
#> F P.Value adj.P.Val
#> 13212_s_at 297.14511 8.048043e-08 0.0006677461
#> 13449_at 154.05666 8.051126e-07 0.0033400095
#> 14609_at 129.24605 1.484583e-06 0.0041058604
#> 16016_at 108.19217 2.752579e-06 0.0048917219
#> 18255_at 106.07250 2.947886e-06 0.0048917219
#> 15162_at 91.58383 4.898100e-06 0.0067732558
For the previous example, try out what output the different broom functions (glance, tidy, augment) produce. Create a Volcano plot for each of the model terms, i.e. plot estimates on x by log(p.values) on y. Are there differences visible between the terms?
head(tidy(fit))
#> # A tibble: 6 x 6
#> gene term estimate statistic p.value lod
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 11986_at typemutant -0.38263784 -2.9258469 0.02180752 -3.500735
#> 2 11987_at typemutant -0.66442177 -2.5373489 0.03836594 -4.069831
#> 3 11988_at typemutant 0.03451959 0.3393213 0.74418455 -6.556257
#> 4 11989_at typemutant -0.26295371 -1.5704944 0.15969599 -5.437119
#> 5 11990_at typemutant -0.10039542 -0.8814401 0.40693026 -6.207405
#> 6 11991_g_at typemutant -0.31199630 -1.7904790 0.11590003 -5.142913
ggplot(tidy(fit), aes(x=estimate, y=log(p.value), colour = p.value < 0.05)) +
facet_wrap(~term) +
geom_point() + ggtitle("Volcano Plots with limma")
bbfit <- tidy(fit)
ggplot(data=bbfit, aes(x=term, y=estimate, group=gene)) +
geom_line(alpha=0.1) +
geom_point(aes(color=log(p.value)), size=2, alpha=0.6)
Is type*treatment interaction necessary? Very strong negative correlation is suspicious.
bbfit_m <- bbfit %>% select(gene, term, estimate, p.value) %>%
gather(fit.stat, value, -gene, -term) %>%
unite(term_stat, term, fit.stat) %>%
spread(term_stat, value) %>%
rename(trt=trttreatment_estimate, mut=typemutant_estimate,
int=`typemutant:trttreatment_estimate`,
trtp=trttreatment_p.value, mutp=typemutant_p.value,
intp=`typemutant:trttreatment_p.value`)
ggpairs(bbfit_m, columns=c(2,4,6), upper=list(continuous="points"),
ggplot2::aes(colour=intp)) + theme(aspect.ratio=1)
ggpairs(bbfit_m, columns=c(2,4,6), upper=list(continuous="points"),
ggplot2::aes(colour=intp)) + theme(aspect.ratio=1)
fit2 <- lmFit(bb[,-(1:2)], model.matrix(~ type+trt, design))
fit2 <- eBayes(fit2)
bbfit2 <- tidy(fit2)
ggplot(data=bbfit2, aes(x=term, y=estimate, group=gene)) +
geom_line(alpha=0.1) +
geom_point(aes(color=log(p.value)), size=2, alpha=0.6)